For the selected models, prepare a knitr/jupyter notebook based on the following points (you can use models created in Homework 1).
Submit your results on GitHub to the directory Homeworks/HW2.
The notebook that this notebook is based on can be found at Homeworks/HW1/PawelPawlik/main.ipynb.
task_A_4(True)
For the first observation the highest-importance variables are cp_0 and caa, while for the second observation the highest-importance variables are thall and oldpeak. We can see that both observations actually assign high importance to cp_0, caa and thall (top-4 variables in both), so we guess that those variables may be important across the dataset. However, the variable that is the most important one in one observation seems to be much less important in the other. This shows that there exist variables that are important to the result only in some context (i.e. in relation to other variables).
task_A_56(True)
We can see that there indeed exists a variable that has a positive contribution in one observation and a negative one in another. This is not surprising, as depending on its value and context a variable may have an opposite influence on the final score.
We can see that the top-2 dalex contribution variables across the two selected observations are similar (there is only one difference, caa vs oldpeak, in the second observation). This shows that there may be significant differences in the analysis across multiple packages (more on this in the comment to point 7 below).
The results in point 5 are similar, which could be expected but was not certain (it would not be surprising to see opposite results, as these packages need to approximate Shapley values, so the results could differ more significantly).
task_A_7()
We can see that different models may have different SHAP attributions. Moreover, the most significant variables may differ (in the above example the top-2 variables are caa and thall for one model, and caa and oldpeak for the other). This shows that we need to be careful when interpreting SHAP analysis results, as the analysis does not present objective facts about the dataset: the interpretations are significantly influenced by the model that was used. This is expected, as at the end of the day we analyze the model.
Calculate Shapley values for player A given the following value function. The method used for these calculations can be found on the slides.
$$ v(\varnothing) = 0 \\ v(A) = 20, \quad A:+20 \\ v(B) = 20, \quad B:+20 \\ v(C) = 60, \quad C:+60 \\ v(A,B) = 60, \quad A:+40, B:+40 \\ v(A,C) = 70, \quad A:+10, C:+50 \\ v(B,C) = 70, \quad B:+10, C:+50 \\ v(A,B,C) = 100, \quad A:+30, B:+30, C:+40 $$

$$ \phi_A = \frac{1}{6}(20\cdot2 + 40 + 10 + 30\cdot2) = \frac{150}{6} = 25 \\ \phi_B = \frac{1}{6}(20\cdot2 + 40 + 10 + 30\cdot2) = \frac{150}{6} = 25 \\ \phi_C = \frac{1}{6}(60\cdot2 + 50 + 50 + 40\cdot2) = \frac{300}{6} = 50 $$

Source code
# Third-party libraries: pandas/sklearn for data + modeling,
# shap/dalex for explainability, matplotlib/seaborn/plotly for plotting.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import RocCurveDisplay, roc_curve
import seaborn as sns
from sklearn.metrics import roc_auc_score, accuracy_score
import matplotlib.pyplot as plt
import numpy as np
import shap
import dalex as dx
import plotly.io as pio
# Render dalex/plotly figures inline in the notebook.
pio.renderers.default = "notebook"
# Global seed so model training and any NumPy sampling are reproducible.
np.random.seed(42)
# Name of the binary target column in heart.csv.
OUTPUT_COLUMN = "output"
I trained RandomForestClassifier in the first homework, so I will be using this model.
def get_preprocessed_dataset(path="heart.csv", test_size=0.33, random_state=42):
    """Load the heart-disease dataset and return a preprocessed train/test split.

    Parameters
    ----------
    path : str, default "heart.csv"
        CSV file with the raw data; must contain the ``OUTPUT_COLUMN`` target
        plus the categorical columns ``"cp"`` and ``"restecg"``.
    test_size : float, default 0.33
        Fraction of rows held out for the test split.
    random_state : int, default 42
        Seed for the train/test split (kept fixed for reproducibility).

    Returns
    -------
    X_train, X_test, y_train, y_test
        Feature frames with "cp"/"restecg" one-hot encoded and all columns
        standardized using statistics fitted on the training split only.
    """
    df = pd.read_csv(path)
    # One-hot encode the two categorical columns (e.g. cp -> cp_0, cp_1, ...).
    df = pd.get_dummies(df, columns=["cp", "restecg"])
    X, y = df.drop(columns=[OUTPUT_COLUMN]), df[OUTPUT_COLUMN]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    # Fit the scaler on the training data only, then apply it to both splits,
    # so no test-set statistics leak into preprocessing.
    scaler = StandardScaler()
    X_train[:] = scaler.fit_transform(X_train)
    X_test[:] = scaler.transform(X_test)
    return X_train, X_test, y_train, y_test
# Train the same model family as in Homework 1 on the preprocessed data.
X_train, X_test, y_train, y_test = get_preprocessed_dataset()
model = RandomForestClassifier().fit(X_train, y_train)
# Reproducibly pick two test observations to explain below.
np.random.seed(123)
idxs = np.random.choice(len(X_test), 2)
X, y = X_test.iloc[idxs], y_test.iloc[idxs]
# Predicted probability of the positive class for the two chosen rows.
y_hat = model.predict_proba(X)[:, 1]
y_hat
I will use the dalex and shap packages.
# Build both explainers once; they are reused by the task_* functions below.
shap_explainer = shap.Explainer(model, X_train)
# check_additivity=False: tree-model SHAP values need not sum exactly to the prediction.
shap_values = shap_explainer(X, check_additivity=False)
for v in shap_values:
    # Waterfall plot of positive-class (column 1) attributions per chosen row.
    shap.plots.waterfall(v[:, 1])
dx_explainer = dx.Explainer(model, X_train, y_train, verbose=False)
for _, row in X.iterrows():
    # dalex break-down plot of per-variable contributions per chosen row.
    dx_explainer.predict_parts(row).plot()
def task_A_4(run_dalex=False):
    """Find two test observations whose top-2 most important SHAP variables
    are completely different, plot their waterfall explanations, and
    optionally cross-check the top-2 attributions with dalex.

    Parameters
    ----------
    run_dalex : bool, default False
        When True, also compute and plot dalex break-down explanations
        for the two chosen observations.

    Raises
    ------
    Exception
        If no observation with a fully disjoint top-2 variable set exists.
    """
    shap_values = shap_explainer(X_test, check_additivity=False)

    def _top2(idx):
        # Positive-class (column 1) explanation for observation `idx` and the
        # indices of its two variables with the largest |SHAP| value.
        obs = shap_values[idx, :, 1]
        return obs, np.argsort(np.abs(obs.values))[-2:]

    idx_obs_1 = 0
    shap_obs_1, high_obs_1 = _top2(idx_obs_1)
    for idx_obs_2 in range(len(shap_values)):
        shap_obs_2, high_obs_2 = _top2(idx_obs_2)
        # Accept only a fully disjoint top-2 variable set. A set intersection
        # is clearer than positional/reversed elementwise comparison and stays
        # correct if the "top k" is ever changed from 2.
        if not (set(high_obs_1) & set(high_obs_2)):
            break
    else:  # no break
        raise Exception("matching observation not found")
    print(f"Chosen observations: {idx_obs_1,idx_obs_2}")
    print(f"2 highest attributions (shap): {X.columns[high_obs_1].tolist(),X.columns[high_obs_2].tolist()}")
    print("\n waterfall plot for first observation")
    shap.plots.waterfall(shap_obs_1)
    print("\n waterfall plot for second observation")
    shap.plots.waterfall(shap_obs_2)
    if not run_dalex:
        return

    def _dx_sorted(idx):
        # dalex break-down result sorted by |contribution|;
        # iloc[1:-1] drops the intercept row and the final prediction row.
        res = dx_explainer.predict_parts(X_test.iloc[idx]).result.iloc[1:-1]
        res["contribution"] = res["contribution"].abs()
        return res.sort_values("contribution", ascending=False)

    dx_obs_1 = _dx_sorted(idx_obs_1)
    dx_obs_2 = _dx_sorted(idx_obs_2)
    print("2 highest attributions (dalex):", dx_obs_1["variable_name"][:2].tolist(), dx_obs_2["variable_name"][:2].tolist())
    dx_explainer.predict_parts(X_test.iloc[idx_obs_1]).plot()
    dx_explainer.predict_parts(X_test.iloc[idx_obs_2]).plot()


task_A_4(True)
def task_A_56(run_dalex=False):
    """Find a variable whose SHAP contribution is positive for one test
    observation and negative for another, then report both contributions
    (optionally cross-checked against dalex break-down values).

    Raises an Exception when no observation with an opposite-sign
    contribution exists for the inspected variable.
    """
    shap_values = shap_explainer(X_test, check_additivity=False)
    var = 0        # feature column under inspection
    idx_obs_1 = 0  # reference observation
    var_obs_1 = shap_values[idx_obs_1, :, 1].values[var]
    # Scan for an observation where the same feature contributes with the
    # opposite sign (product of the two contributions is negative).
    idx_obs_2 = None
    for candidate in range(len(shap_values)):
        contribution = shap_values[candidate, :, 1].values[var]
        if not (var_obs_1 * contribution >= 0):
            idx_obs_2, var_obs_2 = candidate, contribution
            break
    if idx_obs_2 is None:
        raise Exception("matching observation not found")
    print(f"Chosen variable X: {X.columns[var]}")
    print(f"Chosen observations: {idx_obs_1,idx_obs_2}")
    print(f"Shap contributions for chosen observations: {var_obs_1,var_obs_2}")
    if not run_dalex:
        return

    def _dx_contribution(idx):
        # dalex break-down contribution of the inspected variable for one row.
        parts = dx_explainer.predict_parts(X_test.iloc[idx]).result
        return parts.set_index("variable_name").loc[X.columns[var]]["contribution"]

    dx_contr_obs_1 = _dx_contribution(idx_obs_1)
    dx_contr_obs_2 = _dx_contribution(idx_obs_2)
    print(f"Dalex contributions for chosen observations: {dx_contr_obs_1,dx_contr_obs_2}")


task_A_56(True)
def task_A_7():
    """Compare SHAP attributions between the RandomForest model and a
    LogisticRegression baseline, then plot waterfall explanations for the
    observation on which the two models disagree the most (largest summed
    absolute per-feature difference).
    """
    shap_values = shap_explainer(X_test, check_additivity=False)
    model2 = LogisticRegression().fit(X_train, y_train)
    shap_values2 = shap.Explainer(model2, X_train)(X_test)
    # Positive-class RF attributions vs. linear-model attributions,
    # compared feature by feature.
    gap = np.abs(shap_values[:, :, 1].values - shap_values2.values)
    most_diff_obs = gap.sum(-1).argmax()
    print(f"{most_diff_obs=}")
    print("\n waterfall plot for RandomForest")
    shap.plots.waterfall(shap_values[most_diff_obs, :, 1])
    print("\n waterfall plot for LogisticRegression")
    shap.plots.waterfall(shap_values2[most_diff_obs])


task_A_7()